library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
Parameters
# Distance metric used throughout for KNN distances AND error reporting
DIST_METHOD = 'Manhattan'
# Error label follows the metric: MAE for Manhattan, RMSE for Euclidean
errMethod = ifelse(DIST_METHOD == 'Manhattan','MAE','RMSE')
#weightFormula = function(x){ 1/(x)} #inverse of distance
weightFormula = function(x){ 1/(x ^ 2)} #inverse of sq distance
# Keep only required MAC Addresses (the 6 access points used for positioning)
keepMacs = c(
'00:0f:a3:39:e1:c0', #default
#'00:0f:a3:39:dd:cd', #added
'00:14:bf:b1:97:8a',
'00:14:bf:3b:c7:c6',
'00:14:bf:b1:97:90',
'00:14:bf:b1:97:8d',
'00:14:bf:b1:97:81'
)
Files
# Input data paths: "offline" = calibration/training traces, "online" = test traces
offline_file = "../../data/offline.final.trace.txt"
online_file = "../../data/online.final.trace.txt"
Functions (Process Raw Data)
# Parse one raw trace line into a 10-column character matrix
# (one row per access-point reading; NULL when the line has no readings).
processLine = function(x){
  # Tokenise the raw log line on ';', '=' and ','.
  parts = strsplit(x, "[;=,]")[[1]]
  # The first 10 tokens describe the scanning hand-held device itself
  # (time, id, position, orientation -- see book page 9). A line with exactly
  # 10 tokens carries no signal readings, so there is nothing to keep.
  if (length(parts) == 10) {
    return(NULL)
  }
  # Remaining tokens come in groups of 4: MAC address, signal, channel,
  # device type (type 3 = access point, book page 6).
  readings = matrix(data = parts[-(1:10)], ncol = 4, byrow = TRUE)
  # Replicate the device info (time, scanMac, x, y, z, orientation) once per
  # reading and attach the reading columns to the right of it.
  device = matrix(parts[c(2, 4, 6:8, 10)], nrow(readings), 6, byrow = TRUE)
  cbind(device, readings)
}
#' @description Function to read a raw trace file, parse it and clean it into an analysis-ready data frame
#' @param file Filename to be read in
#' @param keepMacs a character vector of MAC addresses to keep; NULL keeps all MACs
#' @returns A dataframe
readData = function(file, keepMacs=NULL){
  # Read in the raw text file
  txt = readLines(file)
  ##############################
  #### Process the raw data ####
  ##############################
  # Drop comment lines, then parse each remaining line into a signal matrix
  lines = txt[substr(txt, 1, 1) != "#" ]
  tmp = lapply(lines, processLine)
  # Stack the per-line matrices into one data frame
  data = as.data.frame(do.call("rbind", tmp), stringsAsFactors = FALSE)
  ######################################################################
  #### Cleaning the Data and Building a Representation for Analysis ####
  ######################################################################
  # Assign column names to the data frame
  names(data) = c(
    "time", "scanMac", "posX", "posY", "posZ",
    "orientation", "mac", "signal",
    "channel", "type"
  )
  # Everything parsed as character; convert the measurement columns
  numVars = c("time", "posX", "posY", "posZ", "orientation", "signal")
  data[numVars] = lapply(data[numVars], as.numeric)
  # Keep only required device types (remove adhoc, type 1; keep access points)
  data = data[data$type != 1, ]
  # Keep only required MAC Addresses.
  # BUG FIX: previously the default keepMacs = NULL dropped EVERY row,
  # because `x %in% NULL` is always FALSE. NULL now means "keep all MACs".
  if (!is.null(keepMacs)) {
    data = data[data$mac %in% keepMacs, ]
  }
  # # From book page 13
  # data$rawTime = data$time
  # data$time = data$time/1000
  # class(data$time) = c("POSIXt", "POSIXct")
  # Discard unwanted columns that dont add any additional information
  data = data[ , !(names(data) %in% c("scanMac", "posZ"))]
  # Snap the raw orientation to the nearest 45-degree reference angle
  data$angle = roundOrientation(data$orientation)
  # Add position identifier "x-y" (unique per measurement location)
  data$posXY = paste(data$posX, data$posY, sep = "-")
  return(data)
}
Offline data
# Number of access points tracked; read as a global by reshapeSS() and findNN()
numMacs = length(keepMacs)
numMacs
## [1] 6
# Snap each angle to the nearest 45-degree reference; 360 wraps back to 0.
roundOrientation = function(angles) {
  # Reference angles 0, 45, ..., 360 (9 values so ~358 snaps to 360, not 315)
  refs = seq(0, by = 45, length = 9)
  nearest = vapply(angles, function(a) which.min(abs(a - refs)), integer(1))
  # Index 9 is the 360-degree bin; map it onto 0 so results stay in [0, 315]
  c(refs[1:8], 0)[nearest]
}
# Load and clean the offline (training/calibration) data
offline = readData(file = offline_file, keepMacs = keepMacs)
dim(offline)
## [1] 769332 10
length(unique(offline$posXY))
## [1] 166
Online Data
# Load and clean the online (test) data
online = readData(file = online_file, keepMacs = keepMacs)
dim(online)
## [1] 34778 10
length(unique(online$posXY))
## [1] 60
Function (Reshape)
# This is equivalent to the tall2wide function.
#' @description Collapse the tall signal data to one row per location: the
#' mean signal per access point becomes one column per MAC address.
#' NOTE(review): relies on the global `numMacs` matching the number of MACs
#' present in `data`, and on tapply() returning MACs in a consistent (sorted)
#' order across locations -- confirm both hold for any new input.
reshapeSS = function(data, varSignal = "signal", keepVars = c("posXY", "posX", "posY"), sampleAngle = FALSE) {
refs = seq(0, by = 45, length = 8)
byLocation =
with(
data,
by(
data,
list(posXY),
function(x) {
# For CV we optionally keep a single randomly chosen angle per location
# (uses the RNG, so results depend on the seed when sampleAngle = TRUE)
if (sampleAngle) x = x[x$angle == sample(refs, size = 1), ]
# One row of location metadata per group
ans = x[1, keepVars]
# Mean signal per MAC address at this location
avgSS = tapply(x[ , varSignal ], x$mac, mean)
y = matrix(avgSS, nrow = 1, ncol = numMacs,
dimnames = list(ans$posXY, names(avgSS)))
cbind(ans, y)
}
)
)
# Stack the per-location rows into one wide data frame
newDataSS = do.call("rbind", byLocation)
return(newDataSS)
}
Reshape Test Data
# Summarise the online (test) data: one wide row per location, keeping the
# orientation/angle columns alongside the per-MAC mean signals
keepVars = c("posXY", "posX","posY", "orientation", "angle")
onlineSummary = reshapeSS(data = online, varSignal = "signal", keepVars = keepVars)
head(onlineSummary,10)
## posXY posX posY orientation angle 00:0f:a3:39:e1:c0
## 0-0.05 0-0.05 0.00 0.05 130.5 135 -52.22727
## 0.15-9.42 0.15-9.42 0.15 9.42 112.3 90 -55.27523
## 0.31-11.09 0.31-11.09 0.31 11.09 230.1 225 -51.70909
## 0.47-8.2 0.47-8.2 0.47 8.20 5.8 0 -49.50000
## 0.78-10.94 0.78-10.94 0.78 10.94 348.3 0 -53.26364
## 0.93-11.69 0.93-11.69 0.93 11.69 158.3 180 -57.96364
## 1.08-12.19 1.08-12.19 1.08 12.19 229.1 225 -54.82727
## 1.24-3.93 1.24-3.93 1.24 3.93 261.5 270 -56.47273
## 1.39-6.61 1.39-6.61 1.39 6.61 114.1 135 -51.28182
## 1.52-9.32 1.52-9.32 1.52 9.32 7.0 0 -50.36697
## 00:14:bf:3b:c7:c6 00:14:bf:b1:97:81 00:14:bf:b1:97:8a
## 0-0.05 -62.94898 -61.81395 -40.06897
## 0.15-9.42 -73.96190 -72.70103 -47.81308
## 0.31-11.09 -70.08247 -70.09890 -54.08824
## 0.47-8.2 -64.25806 -72.59770 -45.65289
## 0.78-10.94 -66.96000 -66.80952 -48.41379
## 0.93-11.69 -70.44340 -70.58025 -43.66346
## 1.08-12.19 -69.20192 -67.92553 -52.00820
## 1.24-3.93 -69.62745 -59.76136 -38.91753
## 1.39-6.61 -62.23913 -64.56627 -48.92381
## 1.52-9.32 -63.35922 -67.48913 -50.04167
## 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0.05 -63.04301 -55.23333
## 0.15-9.42 -69.45455 -46.88000
## 0.31-11.09 -69.13158 -53.88660
## 0.47-8.2 -60.79747 -49.58000
## 0.78-10.94 -65.00000 -54.84694
## 0.93-11.69 -65.59302 -47.27083
## 1.08-12.19 -71.58696 -51.66667
## 1.24-3.93 -71.66667 -53.23333
## 1.39-6.61 -60.79798 -50.49057
## 1.52-9.32 -65.10345 -49.38542
Function (Select Train Data)
#' @description Selects the appropriate observations (based on test data orientation) from the original tall data
#' and reformats it such that it can be used for training the KNN algorithm
#' @param angleNewObs Angle (Orientation) of the test observation
#' @param train_data Original tall-skinny data
#' @param m Keep the 'm' closest orientations to angleNewObs
#' @returns A dataframe suitable for training
selectTrain = function(angleNewObs, train_data, m){
  # Snap the observed angle to its nearest 45-degree reference
  nearestAngle = roundOrientation(angles = angleNewObs)
  if (m %% 2 == 1) {
    # Odd m: (m-1)/2 reference angles on each side of the nearest one
    angles = seq(-45 * (m - 1) /2, 45 * (m - 1) /2, length = m)
  } else {
    # Even m: build m+1 symmetric candidates, then drop the end farther
    # from the observed angle so exactly m angles remain
    m = m + 1
    angles = seq(-45 * (m - 1) /2, 45 * (m - 1) /2, length = m)
    if (sign(angleNewObs - nearestAngle) > -1)
      angles = angles[ -1 ]
    else
      angles = angles[ -m ]
  }
  angles = angles + nearestAngle
  # Wrap into [0, 360). BUG FIX: the wrap previously used `> 360`, so an
  # angle of exactly 360 (e.g. nearestAngle 315 + offset 45) was never
  # remapped to 0 and silently matched no rows (data angles are 0..315).
  angles[angles < 0] = angles[ angles < 0 ] + 360
  angles[angles >= 360] = angles[ angles >= 360 ] - 360
  # Subset only those angles from original data (tall-skinny)
  train_data_subset = train_data[train_data$angle %in% angles, ]
  # Convert to wide and average the data for the same positions
  train_data_subset = reshapeSS(data = train_data_subset, varSignal = "signal")
  return(train_data_subset)
}
Nearest Neighbors
Common Functions
#' @description Computes the distance of the new signal (single observation) to each observation in the training dataset
#' @param newSignal The signal values (one per MAC address) for the new observation
#' @param trainSubset The training data to be used (wide format: posXY, posX, posY, then one signal column per MAC)
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @param method Distance metric, 'Manhattan' or 'Euclidean' (defaults to the global DIST_METHOD)
#' @return A dataframe containing same number of rows as that in the training data.
#' The observations are ordered by the distance to the new signal. Each row contains 5 columns.
#' 1st column is the XY location of the training observation (string)
#' 2nd column is the X location of the training observation (float)
#' 3rd column is the Y location of the training observation (float)
#' 4th column is the distance to the point under consideration to the training observation (float)
#' 5th column is the inverse distance or weight (float). Weight is hard coded to 1 for all observations if weighted = FALSE
findNN = function(newSignal, trainSubset, weighted=FALSE, method = DIST_METHOD) {
  # Signal columns start at column 4; relies on the global `numMacs`
  diffs = apply(trainSubset[ , 4:(4+numMacs-1)], 1, function(x) x - newSignal)
  if (method == 'Euclidean') {
    dists = apply(diffs, 2, function(x) sqrt(sum(x^2)) ) #RSE
  } else if (method == 'Manhattan') {
    dists = apply(diffs, 2, function(x) sum(abs(x)) ) #AE
  } else {
    # FIX: previously an unknown method left `dists` undefined and failed
    # later with a confusing "object 'dists' not found" error
    stop("Unsupported distance method: ", method)
  }
  closest = order(dists)
  ordered_dist = dists[closest]
  if (weighted) {
    # NOTE(review): a zero distance yields an infinite weight via
    # weightFormula (1/d^2) -- confirm exact-duplicate signals cannot occur
    weight = weightFormula(ordered_dist)
  } else {
    weight = rep(1, length(dists))
  }
  return(cbind(trainSubset[closest, 1:3], ordered_dist, weight))
}
#' @description XY Prediction for a single value of k (num neighbors)
#' @param newSignals The Signal Values for the validation data for each observation
#' @param newAngles The Orientation of the validation data for each observation
#' @param trainData The training data to be used
#' @param numAngles Number of closest reference angles to include in the data
#' @param k Perform the prediction for num neighbors = k
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @return A matrix with num rows = number of (validation) observations and num columns = 2
#' Each row indicates the prediction of the mean X and Y values for that observation
predXY = function(newSignals, newAngles, trainData, numAngles = 1, k = 3, weighted=FALSE){
  # FIX: preallocate the per-observation neighbor tables. The previous
  # `list(length = nrow(newSignals))` created a single element named
  # "length" rather than a list of n empty slots.
  closeXY = vector("list", nrow(newSignals))
  for (i in seq_len(nrow(newSignals))) {
    # Training subset restricted to the angles closest to this observation
    trainSS = selectTrain(newAngles[i], trainData, m = numAngles)
    closeXY[[i]] = findNN(
      newSignal = as.numeric(newSignals[i, ]),
      trainSubset = trainSS,
      weighted = weighted
    )
  }
  #' @description Returns the (un)weighted mean X and Y locations for a single observation and single value of neighbors
  #' @param x Dataframe of distance-ordered neighbors: posXY, posX, posY, distance, weight
  #' @param k Number of nearest neighbors to use
  #' @return A pair of XY mean values for k number of neighbors
  k_means_single_obs = function(x, k){
    weights = x[1:k, 5]
    # Weight collapses to a plain mean when weighted = FALSE (all weights 1)
    weighted_x = sum(x[1:k, 2] * weights) / sum(weights)
    weighted_y = sum(x[1:k, 3] * weights) / sum(weights)
    return(c(weighted_x, weighted_y))
  }
  estXY = lapply(closeXY, k_means_single_obs, k)
  estXY = do.call("rbind", estXY)
  return(estXY)
}
#' @description Aggregate positioning error between predicted and actual XY
#' locations: mean absolute error per row (Manhattan) or sqrt of total squared
#' error divided by rows (Euclidean), matching DIST_METHOD by default.
#' @param estXY Predicted locations: 2-column matrix/data frame, or a bare numeric pair for one observation
#' @param actualXY Actual locations, same shape as estXY
#' @param method 'Manhattan' or 'Euclidean'
#' @return A single numeric error value
calcError = function(estXY, actualXY, method = DIST_METHOD){
  # A bare numeric vector represents a single prediction (one row)
  if (inherits(estXY, "numeric")) rows = 1 else rows = nrow(estXY)
  if (method == 'Euclidean') {
    er = sqrt(sum(rowSums((estXY - actualXY)^2)))/rows
  } else if (method == 'Manhattan') {
    er = sum(rowSums(abs(estXY - actualXY)))/rows
  } else {
    # FIX: previously an unknown method returned an undefined `er`
    stop("Unsupported distance method: ", method)
  }
  return(er)
}
K-Fold
Setup
# Reproducible fold assignment
set.seed(42)
# Grid-search bounds: up to K neighbors, v cross-validation folds
K = 20
v = 11
allNeighbors = c(1:K)
allNeighbors
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# Candidate values for numAngles (1 to 8 closest reference angles)
allAngles = c(1:8)
allAngles
## [1] 1 2 3 4 5 6 7 8
# Shuffle the 166 locations and arrange into folds (one column per fold).
# NOTE(review): 166 is not a multiple of 15 x 11 = 165, so the last shuffled
# location is silently dropped -- hence the matrix() warning below.
permuteLocs = sample(unique(offline$posXY))
permuteLocs = matrix(permuteLocs, ncol = v, nrow = floor(length(permuteLocs)/v))
## Warning in matrix(permuteLocs, ncol = v, nrow = floor(length(permuteLocs)/v)):
## data length [166] is not a sub-multiple or multiple of the number of rows [15]
permuteLocs
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] "11-3" "24-6" "27-8" "25-8" "7-7" "24-8" "22-8" "12-5" "0-12" "0-7"
## [2,] "9-7" "21-6" "2-9" "24-7" "5-8" "15-8" "11-8" "32-6" "2-7" "29-8"
## [3,] "14-7" "30-3" "12-7" "23-4" "2-2" "10-5" "4-7" "17-8" "0-11" "17-7"
## [4,] "13-8" "3-3" "32-8" "30-7" "2-8" "33-7" "25-3" "1-13" "12-6" "24-3"
## [5,] "33-8" "15-7" "30-8" "0-3" "20-3" "21-3" "2-11" "13-3" "8-8" "22-3"
## [6,] "21-8" "0-8" "0-1" "2-13" "1-5" "0-4" "0-0" "23-3" "1-10" "16-8"
## [7,] "20-8" "24-5" "26-3" "31-8" "31-3" "10-4" "3-8" "2-5" "8-7" "0-9"
## [8,] "27-3" "1-1" "32-3" "2-6" "13-7" "11-4" "26-6" "8-3" "26-4" "1-3"
## [9,] "9-3" "19-3" "2-1" "14-8" "33-4" "6-8" "5-7" "1-4" "13-6" "4-8"
## [10,] "1-12" "25-4" "20-7" "24-4" "26-7" "0-13" "22-4" "19-7" "33-3" "18-8"
## [11,] "10-8" "2-0" "1-0" "16-7" "7-8" "3-7" "6-7" "26-8" "2-4" "32-5"
## [12,] "21-4" "11-5" "21-7" "27-7" "32-7" "13-5" "15-3" "9-4" "7-3" "22-7"
## [13,] "16-3" "4-3" "28-8" "21-5" "1-6" "22-5" "19-8" "10-7" "1-11" "25-7"
## [14,] "26-5" "18-7" "23-7" "12-8" "29-3" "23-5" "13-4" "2-10" "2-3" "10-3"
## [15,] "1-8" "5-3" "1-9" "0-10" "12-4" "31-7" "28-3" "11-7" "23-6" "28-7"
## [,11]
## [1,] "10-6"
## [2,] "1-7"
## [3,] "23-8"
## [4,] "11-6"
## [5,] "14-3"
## [6,] "12-3"
## [7,] "29-7"
## [8,] "22-6"
## [9,] "18-3"
## [10,] "17-3"
## [11,] "9-8"
## [12,] "6-3"
## [13,] "32-4"
## [14,] "0-2"
## [15,] "1-2"
# Illustration: tall-format rows for the locations assigned to fold 1
onlineFold = subset(offline, posXY %in% permuteLocs[ , 1])
head(onlineFold)
## time posX posY orientation mac signal channel
## 154732 1.139653e+12 1 8 0.7 00:14:bf:b1:97:8a -47 2437000000
## 154733 1.139653e+12 1 8 0.7 00:14:bf:b1:97:8d -52 2442000000
## 154734 1.139653e+12 1 8 0.7 00:0f:a3:39:e1:c0 -52 2462000000
## 154735 1.139653e+12 1 8 0.7 00:14:bf:b1:97:90 -51 2427000000
## 154736 1.139653e+12 1 8 0.7 00:14:bf:3b:c7:c6 -68 2432000000
## 154737 1.139653e+12 1 8 0.7 00:14:bf:b1:97:81 -67 2422000000
## type angle posXY
## 154732 3 0 1-8
## 154733 3 0 1-8
## 154734 3 0 1-8
## 154735 3 0 1-8
## 154736 3 0 1-8
## 154737 3 0 1-8
# For reference: the wide-format summary of the real online (test) data
head(onlineSummary)
## posXY posX posY orientation angle 00:0f:a3:39:e1:c0
## 0-0.05 0-0.05 0.00 0.05 130.5 135 -52.22727
## 0.15-9.42 0.15-9.42 0.15 9.42 112.3 90 -55.27523
## 0.31-11.09 0.31-11.09 0.31 11.09 230.1 225 -51.70909
## 0.47-8.2 0.47-8.2 0.47 8.20 5.8 0 -49.50000
## 0.78-10.94 0.78-10.94 0.78 10.94 348.3 0 -53.26364
## 0.93-11.69 0.93-11.69 0.93 11.69 158.3 180 -57.96364
## 00:14:bf:3b:c7:c6 00:14:bf:b1:97:81 00:14:bf:b1:97:8a
## 0-0.05 -62.94898 -61.81395 -40.06897
## 0.15-9.42 -73.96190 -72.70103 -47.81308
## 0.31-11.09 -70.08247 -70.09890 -54.08824
## 0.47-8.2 -64.25806 -72.59770 -45.65289
## 0.78-10.94 -66.96000 -66.80952 -48.41379
## 0.93-11.69 -70.44340 -70.58025 -43.66346
## 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0.05 -63.04301 -55.23333
## 0.15-9.42 -69.45455 -46.88000
## 0.31-11.09 -69.13158 -53.88660
## 0.47-8.2 -60.79747 -49.58000
## 0.78-10.94 -65.00000 -54.84694
## 0.93-11.69 -65.59302 -47.27083
# Build the CV validation summary from the OFFLINE data, sampling one random
# angle per location to mimic how the online data was collected
keepVars = c("posXY", "posX","posY", "orientation", "angle")
onlineCVSummary = reshapeSS(offline, keepVars = keepVars, sampleAngle = TRUE)
head(onlineCVSummary)
## posXY posX posY orientation angle 00:0f:a3:39:e1:c0 00:14:bf:3b:c7:c6
## 0-0 0-0 0 0 90.3 90 -50.58559 -62.59551
## 0-1 0-1 0 1 89.8 90 -55.17273 -65.70000
## 0-10 0-10 0 10 225.3 225 -51.47706 -63.01163
## 0-11 0-11 0 11 135.2 135 -50.16364 -67.87778
## 0-12 0-12 0 12 135.0 135 -52.52727 -68.90816
## 0-13 0-13 0 13 135.2 135 -54.91818 -71.76190
## 00:14:bf:b1:97:81 00:14:bf:b1:97:8a 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0 -63.78261 -33.74737 -63.12941 -55.19588
## 0-1 -63.94186 -40.21782 -63.52381 -60.47826
## 0-10 -68.58416 -49.01010 -66.71111 -54.56180
## 0-11 -72.34000 -47.57895 -66.17241 -53.97000
## 0-12 -70.32222 -43.11009 -63.73494 -48.99000
## 0-13 -74.19588 -42.49438 -67.70423 -50.82828
# First Fold (validation): wide-format rows for fold-1 locations
onlineFold = subset(onlineCVSummary, posXY %in% permuteLocs[ , 1])
head(onlineCVSummary,01)
## posXY posX posY orientation angle 00:0f:a3:39:e1:c0 00:14:bf:3b:c7:c6
## 0-0 0-0 0 0 90.3 90 -50.58559 -62.59551
## 00:14:bf:b1:97:81 00:14:bf:b1:97:8a 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0 -63.78261 -33.74737 -63.12941 -55.19588
# First Fold (Train): tall-format rows for every location NOT in fold 1
offlineFold = subset(offline, posXY %in% permuteLocs[ , -1])
head(offlineFold)
## time posX posY orientation mac signal channel type
## 1 1.139643e+12 0 0 0 00:14:bf:b1:97:8a -38 2437000000 3
## 2 1.139643e+12 0 0 0 00:14:bf:b1:97:90 -56 2427000000 3
## 3 1.139643e+12 0 0 0 00:0f:a3:39:e1:c0 -53 2462000000 3
## 4 1.139643e+12 0 0 0 00:14:bf:b1:97:8d -65 2442000000 3
## 5 1.139643e+12 0 0 0 00:14:bf:b1:97:81 -65 2422000000 3
## 6 1.139643e+12 0 0 0 00:14:bf:3b:c7:c6 -66 2432000000 3
## angle posXY
## 1 0 0-0
## 2 0 0-0
## 3 0 0-0
## 4 0 0-0
## 5 0 0-0
## 6 0 0-0
# Sanity check: predict fold-1 locations using 3 angles and 3 neighbors
estFold = predXY(
newSignals = onlineFold[ , 6:(6+numMacs-1)],
newAngles = onlineFold[ , 4],
offlineFold,
numAngles = 3,
k = 3
)
actualFold = onlineFold[ , c("posX", "posY")]
calcError(estFold, actualFold)
## [1] 2.266667
Faster Cross Validation
Common Functions
#' @description Modified XY Prediction to help with faster CV for all K values at once (from 1 to K)
#' @param newSignals The Signal Values for the validation data for each observation
#' @param newAngles The Orientation of the validation data for each observation
#' @param trainData The training data to be used
#' @param numAngles Number of closest reference angles to include in the data
#' @param K Perform the prediction for num neighbors from 1 to K
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @return A nested matrix with num rows = number of (validation) observations and num columns = K
#' Each entry in this matrix is a vector of 2 values
#' indicating the prediction of the mean X and Y values for that obs and num neighbors
predXYallK = function(newSignals, newAngles, trainData, numAngles = 1, K = 10, weighted=FALSE){
  # FIX: preallocate the per-observation neighbor tables. The previous
  # `list(length = nrow(newSignals))` created a single element named
  # "length" rather than a list of n empty slots.
  closeXY = vector("list", nrow(newSignals))
  for (i in seq_len(nrow(newSignals))) {
    trainSS = selectTrain(newAngles[i], trainData, m = numAngles)
    closeXY[[i]] = findNN(
      newSignal = as.numeric(newSignals[i, ]),
      trainSubset = trainSS,
      weighted = weighted
    )
  }
  #' @description Returns the (un)weighted mean X and Y locations for a single observation and single value of neighbors
  #' @param x Dataframe of distance-ordered neighbors: posXY, posX, posY, distance, weight
  #' @param k Number of nearest neighbors to use
  #' @return A pair of XY mean values for k number of neighbors
  k_means_single_obs = function(x, k){
    weights = x[1:k, 5]
    weighted_x = sum(x[1:k, 2] * weights) / sum(weights)
    weighted_y = sum(x[1:k, 3] * weights) / sum(weights)
    return(c(weighted_x, weighted_y))
  }
  #' @description Returns the (un)weighted mean X and Y locations for a single observation and multiple neighbor values
  #' @param x Dataframe of distance-ordered neighbors (see k_means_single_obs)
  #' @param K Largest number of nearest neighbors to use
  #' @return A list of K pairs (each pair is a XY mean value for a single k)
  all_K_means_single_obs = function(x, K){
    # FIX: preallocate instead of growing the list inside the loop
    rows = vector("list", K)
    for(k in seq_len(K)){
      rows[[k]] = k_means_single_obs(x, k)
    }
    return(rows)
  }
  estXY = lapply(closeXY, all_K_means_single_obs, K)
  estXY = do.call("rbind", estXY)
  return(estXY)
}
#' @description Runs v-fold cross validation, scoring every neighbor count
#' from k = 1 to K for a single numAngles setting
#' @param K Number of nearest neighbors to use (Will run Grid Search over all values from k = 1 to K)
#' @param v Number of folds to use
#' @param offline Use "as is" from script for now
#' @param onlineCVSummary Use "as is" from script for now
#' @param folds A matrix with rows = number of observations in each fold and columns = number of folds.
#' The values are the XY IDs to be included in that fold
#' @param numAngles Number of closest reference angles to include in the data
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @return A list: `err` = error summed over folds for each k (length K),
#' `allErr` = data frame of per-fold errors for each k from 1 to K
run_kfold = function(K, v, offline, onlineCVSummary, folds, numAngles, weighted=FALSE){
err= rep(0, K)
errCV = rep(0, K)
allErr = data.frame()
for (j in 1:v) {
print(paste("Running Fold: ", j))
# Fold j is validation; every other fold's locations form the training set
onlineFold = subset(onlineCVSummary, posXY %in% folds[ , j])
offlineFold = subset(offline, posXY %in% folds[ , -j])
actualFold = onlineFold[ , c("posX", "posY")]
# One call scores all k values at once (estFold is a matrix of XY pairs,
# rows = observations, columns = k)
estFold = predXYallK(
newSignals = onlineFold[ , 6:(6+numMacs-1)],
newAngles = onlineFold[ , 4],
trainData = offlineFold,
numAngles = numAngles,
K = K,
weighted=weighted
)
# Reformat into correct format for each 'k' value
for(k in 1:K){
estSingleK = data.frame()
for(i in seq(1, length(estFold)/K)){ # i = number of the observation
estSingleK = rbind(estSingleK, t(as.data.frame(estFold[i,k])))
}
# err accumulates across folds; errCV keeps only this fold's value
err[k] = err[k] + calcError(estSingleK, actualFold)
errCV[k] = calcError(estSingleK, actualFold) #returning all folds
}
allErr=rbind(allErr,data.frame('fold'=j, 'numNeighbors' = 1:K,'errValue' = errCV))
}
return(list(err=err,allErr=allErr))
}
Parallel CV and Plot
#' @description Run the k-fold grid search in parallel, one worker per value
#' of numAngles taken from the global `allAngles`. Returns per-fold errors for
#' every (numAngles, numNeighbors) combination.
#' NOTE(review): the `numAngles` argument is shadowed by the foreach iterator
#' and never used; `allErrors` below is also assigned but unused -- confirm
#' both are intentional leftovers.
get_CV = function(K,v,offline,onlineCVSummary,permuteLocs,numAngles,weighted = TRUE){
library(foreach)
library(progress)
library(doParallel)
library(doSNOW)
# One worker per core; doSNOW is used because it supports the progress hook
cl <- makeCluster(detectCores())
doSNOW::registerDoSNOW(cl)
allErrors = data.frame()
start = proc.time()
pb <- progress::progress_bar$new(total = length(allAngles),format='[:bar] :percent :eta')
progress <- function(n) pb$tick()
# Workers run in fresh sessions, so every helper and global they touch must
# be exported explicitly
allErrorsCV = foreach(numAngles = allAngles
,.combine = rbind
,.options.snow = list(progress=progress)
,.export = c('run_kfold','predXYallK','reshapeSS','findNN','calcError'
,'numMacs','selectTrain','roundOrientation','DIST_METHOD'
,'weightFormula')
) %dopar% {
print(paste("Running ", v, "-fold cross validation with 1 to ", K, " neighbors, for number of Angles = ", numAngles))
err = run_kfold(
K = K,
v = v,
offline = offline,
onlineCVSummary = onlineCVSummary,
folds = permuteLocs,
numAngles = numAngles,
weighted = weighted
)
# Tag the fold errors with the angle count this worker handled
err$allErr$numAngles = numAngles
return(err$allErr)
#return(data.frame(t(err)))
}
stopCluster(cl)
stop = proc.time()
diff = stop-start
print(diff)
return(allErrorsCV)
}
#' @description Average the CV errors over folds and flag the best model and
#' the one-standard-error (1-SE) model using caret's best()/oneSE() rules.
#' NOTE(review): errValueSD is the SD across ALL (numAngles, numNeighbors)
#' combinations, not the per-combination SD across folds, and oneSE(num = 30)
#' hard-codes the resample count -- confirm both are intended.
find_best = function(allErrorsCV){
library('caret')
library(tidyverse)
allErrors = allErrorsCV %>%
group_by(numAngles,numNeighbors) %>%
dplyr::summarise(errValue = mean(errValue)) %>%
ungroup() %>%
mutate(errValueSD=sd(errValue)
,best=FALSE
,oneSE=FALSE)
# caret::best / caret::oneSE return the row index of the selected model
allErrors[best(as.data.frame(allErrors),"errValue",maximize=FALSE),]$best=TRUE
allErrors[oneSE(as.data.frame(allErrors),"errValue",maximize=FALSE,num=30),]$oneSE=TRUE
return(allErrors)
}
#' @description Interactive heat map of mean CV error over every
#' (numNeighbors, numAngles) cell, with the best and 1-SE models labelled.
#' Relies on the globals `errMethod` and `allAngles`.
plot_best = function(allErrors) {
p = ggplot(allErrors, aes(x=numNeighbors, y=numAngles, fill= errValue
, text=paste0("A:",numAngles," N:",numNeighbors,errMethod," :", round(errValue,3)))) +
geom_tile() +
scale_y_continuous(breaks=allAngles) +
#scale_fill_distiller(palette = "RdYlBu") +
# Diverging gradient centred midway between the min and max error
scale_fill_gradient2(low = "green",mid='darkorange', high = "darkred", na.value = NA
,midpoint=mean(c(max(allErrors$errValue),min(allErrors$errValue)))
#,midpoint=median(Errors$errValue)
)+
#scale_fill_distiller(palette = "Blues",direction=0) +
labs(fill = errMethod) +
# Annotate the tiles selected by find_best()
geom_text(data=allErrors[allErrors$best,],label='Best',size=3,nudge_y=.27) +
geom_text(data=allErrors[allErrors$oneSE,],label='1SE',size=3)
#p
ggplotly(p, tooltip="text")
}
Unweighted
# Grid-search CV over all (numAngles, k) combinations with UNWEIGHTED KNN.
# NOTE(review): `numAngles=numAngles` passes an undefined global here; it
# only works because the argument is shadowed (never forced) inside get_CV.
allErrorsCV = get_CV(K=K,v=v
,offline=offline,onlineCVSummary=onlineCVSummary
,permuteLocs=permuteLocs,numAngles=numAngles
,weighted = FALSE)
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
## Loading required package: snow
##
## Attaching package: 'snow'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, clusterSplit, makeCluster, parApply,
## parCapply, parLapply, parRapply, parSapply, splitIndices,
## stopCluster
## user system elapsed
## 2.53 2.70 134.58
# Average errors across folds and flag the best / one-standard-error models
allErrors = find_best(allErrorsCV)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## `summarise()` regrouping output by 'numAngles' (override with `.groups` argument)
allErrors
## # A tibble: 160 x 6
## numAngles numNeighbors errValue errValueSD best oneSE
## <int> <int> <dbl> <dbl> <lgl> <lgl>
## 1 1 1 3.43 0.194 FALSE FALSE
## 2 1 2 2.95 0.194 FALSE FALSE
## 3 1 3 2.79 0.194 FALSE FALSE
## 4 1 4 2.78 0.194 FALSE FALSE
## 5 1 5 2.78 0.194 FALSE FALSE
## 6 1 6 2.81 0.194 FALSE FALSE
## 7 1 7 2.87 0.194 FALSE FALSE
## 8 1 8 2.88 0.194 FALSE FALSE
## 9 1 9 2.95 0.194 FALSE FALSE
## 10 1 10 2.97 0.194 FALSE FALSE
## # ... with 150 more rows
# Selected model(s): here best and 1-SE coincide (6 angles, k = 2)
print(filter(allErrors,best | oneSE))
## # A tibble: 1 x 6
## numAngles numNeighbors errValue errValueSD best oneSE
## <int> <int> <dbl> <dbl> <lgl> <lgl>
## 1 6 2 2.70 0.194 TRUE TRUE
plot_best(allErrors)
Final Model
# Final unweighted model: take the 1-SE row's hyperparameters
final = filter(allErrors,oneSE)
finalAngle = final$numAngles
finalK = final$numNeighbors
finalAngle
## [1] 6
finalK
## [1] 2
#
# final = which(allErrors == min(allErrors), arr.ind = TRUE)
#
#
# finalAngleIndex = final[1]
# finalKIndex = final[2]
# finalAngle = allAngles[finalAngleIndex]
# finalK = allNeighbors[finalKIndex]
#
# finalAngle
# finalK
# Evaluate on the true online test set (trained on all offline data)
actualXY = onlineSummary %>% dplyr::select(posX, posY)
estXYfinalK = predXY(
newSignals = onlineSummary[ , 6:(6+numMacs-1)],
newAngles = onlineSummary[ , 4],
trainData = offline,
numAngles = finalAngle,
k = finalK,
weighted = FALSE
)
calcError(estXYfinalK, actualXY)
## [1] 2.7525
Weighted
# Repeat the CV grid search with DISTANCE-WEIGHTED KNN
allErrorsCVW = get_CV(K=K,v=v
,offline=offline,onlineCVSummary=onlineCVSummary
,permuteLocs=permuteLocs,numAngles=numAngles
,weighted = TRUE)
## user system elapsed
## 2.54 2.64 119.70
allErrorsW = find_best(allErrorsCVW)
## `summarise()` regrouping output by 'numAngles' (override with `.groups` argument)
allErrorsW
## # A tibble: 160 x 6
## numAngles numNeighbors errValue errValueSD best oneSE
## <int> <int> <dbl> <dbl> <lgl> <lgl>
## 1 1 1 3.43 0.125 FALSE FALSE
## 2 1 2 2.93 0.125 FALSE FALSE
## 3 1 3 2.77 0.125 FALSE FALSE
## 4 1 4 2.72 0.125 FALSE TRUE
## 5 1 5 2.69 0.125 TRUE FALSE
## 6 1 6 2.70 0.125 FALSE FALSE
## 7 1 7 2.73 0.125 FALSE FALSE
## 8 1 8 2.74 0.125 FALSE FALSE
## 9 1 9 2.76 0.125 FALSE FALSE
## 10 1 10 2.78 0.125 FALSE FALSE
## # ... with 150 more rows
# Weighted candidates: 1-SE model is (1 angle, k = 4); best is (1 angle, k = 5)
print(filter(allErrorsW,best | oneSE))
## # A tibble: 2 x 6
## numAngles numNeighbors errValue errValueSD best oneSE
## <int> <int> <dbl> <dbl> <lgl> <lgl>
## 1 1 4 2.72 0.125 FALSE TRUE
## 2 1 5 2.69 0.125 TRUE FALSE
plot_best(allErrorsW)
Final Model
# Final weighted model: take the 1-SE row's hyperparameters.
finalW = filter(allErrorsW,oneSE)
# BUG FIX: these previously read from `final` (the UNWEIGHTED model's row),
# so the weighted final model silently reused the unweighted parameters
# (6 angles, k = 2) instead of the weighted 1-SE choice (1 angle, k = 4).
finalAngleW = finalW$numAngles
finalKW = finalW$numNeighbors
finalAngleW
## [1] 1  (previous output showed 6 -- taken from `final` by mistake)
finalKW
## [1] 4  (previous output showed 2 -- taken from `final` by mistake)
# Evaluate on the true online test set with distance weighting
actualXY = onlineSummary %>% dplyr::select(posX, posY)
estXYfinalKW = predXY(
newSignals = onlineSummary[ , 6:(6+numMacs-1)],
newAngles = onlineSummary[ , 4],
trainData = offline,
numAngles = finalAngleW,
k = finalKW,
weighted = TRUE
)
calcError(estXYfinalKW, actualXY)
## [1] 2.732295  (stale: computed with the unweighted parameters; rerun to refresh)